# Authors: D.S. Hansen & Gijs Schumacher
# Email: g.schumacher@uva.nl
# Replication code for the paper "A New Dataset of Dutch and Danish Party Congress Speeches"

# Last updated: February 2019
# R version 3.5.2 (2018-12-20)


# Basic setup -------------------------------------------------------------
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# deletes whatever the caller has in memory when this script is sourced, yet
# does not give a clean session (attached packages and options survive).
# Run the script in a fresh R session instead.
library(quanteda) # quanteda_1.4.0
library(stringr) # stringr_1.4.0
library(openxlsx) # openxlsx_4.1.0
# print() so the versions are also shown when the script is run via source()
print(sessionInfo()) # Check which versions you have installed

# Load Danish speech corpus from SURF-drive
# load() restores the objects stored in the file; the subset below expects it
# to provide an object named `corpus` with a `country` document variable
# (NOTE(review): not visible from this script - confirm against the .Rdata).
load("corpus_DK_NL.Rdata")
text <- corpus_subset(corpus, country == "DK")

# Load sentiment data. NRC Emotion Lexicon (v0.92, downloaded from http://sentiment.nrc.ca/lexicons-for-research/)
sentiment.url <- "http://sentiment.nrc.ca/lexicons-for-research/NRC-Emotion-Lexicon.zip"
temp <- tempfile(fileext = ".zip")
# mode = "wb": the archive is binary and must be written unmodified
download.file(sentiment.url, temp, mode = "wb")
unzip(temp) # extracts into the current working directory
unlink(temp) # the downloaded archive is no longer needed once extracted
sentiment <- read.xlsx("NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx")
# Keep three columns by position. Column 20 supplies the dictionary entries
# (presumably the Danish translations - confirm against the spreadsheet);
# columns 106 and 107 are the 0/1 flags that the dictionary step treats as
# "positive" and "negative" respectively.
sentiment <- sentiment[, c(20, 106, 107)]

# in case the NRC Emotion Lexicon is no longer available online
#load("sentiment.dk.RData")
#sentiment <- sentiment.dk

# Do sentiment analysis ---------------------------------------------------
# Two-key dictionary: a lexicon entry (column 1) belongs to a key when the
# corresponding flag column (2 = positive, 3 = negative) equals 1.
positive.rows <- which(sentiment[, 2] == 1)
negative.rows <- which(sentiment[, 3] == 1)
sentiment.dictionary <- dictionary(list(positive = sentiment[positive.rows, 1],
                                        negative = sentiment[negative.rows, 1]))

# Making dfm with speech data with positive and negative words
# (distinct object names so the quanteda functions dfm() and dictionary()
# are not masked by variables of the same name)
speech.dfm <- dfm(text)
dfm.sentiment <- dfm(speech.dfm, dictionary = sentiment.dictionary)

# Total scores
# Work with plain numeric vectors, selected by dictionary key name rather than
# by column position, so no column carries a misleading inherited name.
token.counts <- ntoken(speech.dfm)
key.counts <- as.matrix(dfm.sentiment)
positive <- unname((key.counts[, "positive"] / token.counts) * 100)
negative <- unname((key.counts[, "negative"] / token.counts) * 100)
arousal <- positive + negative
# NaN for documents without any dictionary match (arousal == 0)
polarity <- (positive - negative) / arousal

# Name the score columns explicitly instead of renaming columns 92:93
# afterwards, which was only correct when docvars(text) had exactly 89 columns.
danish.speeches.analysis <- data.frame(docvars(text),
                                       positive = positive,
                                       negative = negative,
                                       polarity = polarity,
                                       arousal = arousal)

# Create some additional variables --------------------------------------
# Both period labels are derived from the 3rd and 4th characters of `year`.
year.values <- danish.speeches.analysis$year
tens.digit <- substr(year.values, 3, 3)
# String comparison against "5": TRUE when the last character sorts before "5"
in.first.half <- substr(year.values, 4, 4) < "5"

# fiveyear: range label built as <tens>0-<tens>4 or <tens>5-<tens>9
range.suffix <- ifelse(in.first.half,
                       paste0("0-", tens.digit, "4"),
                       paste0("5-", tens.digit, "9"))
fiveyear.labels <- paste0(tens.digit, range.suffix)
# Levels follow the order in which the labels first appear in the data
danish.speeches.analysis$fiveyear <- factor(fiveyear.labels,
                                            levels = unique(fiveyear.labels))

# fiveyear2: short label built as <tens>0 or <tens>5
fiveyear2.labels <- paste0(tens.digit, ifelse(in.first.half, "0", "5"))
danish.speeches.analysis$fiveyear2 <- factor(fiveyear2.labels,
                                             levels = unique(fiveyear2.labels))

# Write out ---------------------------------------------------------------
# Stores the analysis data frame, under its own name, in the working directory.
#setwd("C:/Users/gschuma1/surfdrive/Papers/Published Work/Congress speeches data paper/Data & Analysis/")
save(list = "danish.speeches.analysis",
     file = "danish.speeches.analysis.Rdata")


